Load in the data and convert it to a data frame.
# Read the raw revision dump and split each line on the "<<sep>>" delimiter,
# stacking the pieces into a character matrix (one row per revision record).
# fixed = TRUE: the delimiter is a literal string, not a regex; spelled out
# as TRUE rather than the reassignable shorthand T.
revisions <- do.call(rbind, strsplit(readLines("data/all_revisions_1000_articles.txt"), "<<sep>>", fixed = TRUE))
head(revisions)
## [,1] [,2] [,3] [,4]
## [1,] "47" "233248" "AbalonE" "2001-01-28T09:38:56Z"
## [2,] "3527" "383723" "Military_of_Bassas_da_India" "2002-02-25T15:51:15Z"
## [3,] "6330" "243308" "Clement_Martyn_Doke" "2001-09-13T17:30:56Z"
## [4,] "6330" "882117" "Clement_Martyn_Doke" "2002-02-25T15:51:15Z"
## [5,] "6330" "4893101" "Clement_Martyn_Doke" "2003-05-02T10:50:27Z"
## [6,] "6330" "4931724" "Clement_Martyn_Doke" "2004-07-28T22:05:55Z"
## [,5] [,6] [,7] [,8]
## [1,] "BryceHarrington" "3684" "" ""
## [2,] "ip:Conversion_script" "ip:Conversion_script" "" ""
## [3,] "BenBaker" "256" "" ""
## [4,] "ip:Conversion_script" "ip:Conversion_script" "" ""
## [5,] "JohnOwens" "4558" "" ""
## [6,] "Evertype" "58589" "" ""
## [,9]
## [1,] ""
## [2,] "Bassas_da_India"
## [3,] "1980 Linguist 1893 South_African"
## [4,] "1980 Linguist 1893 Africa South_African"
## [5,] "1980 Linguist 1893 Africa South_African"
## [6,] "1980 1893 Africa South_African Click_consonant Bantu_languages Khoisan_languages Linguist Zulu_language"
## [,10] [,11] [,12] [,13] [,14] [,15]
## [1,] "" "" "" "" "" ""
## [2,] "" "" "" "" "" ""
## [3,] "" "" "" "" "" ""
## [4,] "" "" "" "" "" ""
## [5,] "" "" "" "" "" ""
## [6,] "" "" "" "" "" ""
## [,16]
## [1,] "I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG"
## [2,] "Automated conversion"
## [3,] "*"
## [4,] "Automated conversion"
## [5,] "sentence"
## [6,] ""
## [,17] [,18]
## [1,] "0" "118"
## [2,] "1" "16"
## [3,] "1" "8"
## [4,] "1" "15"
## [5,] "1" "16"
## [6,] "0" "47"
# Convert each column of the character matrix to its natural type
# (integer where possible, character otherwise) and attach readable names.
# seq_len() is used instead of 1:ncol(), which would misbehave on an
# empty matrix.
revisions_processed <-
  setNames(
    as.data.frame(lapply(seq_len(ncol(revisions)), function(i) {
      type.convert(revisions[, i], as.is = TRUE)
    }), stringsAsFactors = FALSE),
    c("article_id", "rev_id", "article_title", "timestamp", "[ip:]username",
      "user_id", "CATEGORY", "IMAGE", "MAIN", "TALK", "USER", "USER_TALK",
      "OTHER", "EXTERNAL", "TEMPLATE", "COMMENT", "MINOR", "TEXTDATA")
  )
library(tidyr)
# Split the ISO-8601 timestamp ("YYYY-MM-DDTHH:MM:SSZ") into date and time.
revisions_processed <- separate(data = revisions_processed, col = timestamp, into = c('date', 'time'), sep = "T")
str(revisions_processed)
## 'data.frame': 42363 obs. of 19 variables:
## $ article_id : int 47 3527 6330 6330 6330 6330 6330 6330 6330 6330 ...
## $ rev_id : int 233248 383723 243308 882117 4893101 4931724 4931731 5518069 5541683 5629426 ...
## $ article_title: chr "AbalonE" "Military_of_Bassas_da_India" "Clement_Martyn_Doke" "Clement_Martyn_Doke" ...
## $ date : chr "2001-01-28" "2002-02-25" "2001-09-13" "2002-02-25" ...
## $ time : chr "09:38:56Z" "15:51:15Z" "17:30:56Z" "15:51:15Z" ...
## $ [ip:]username: chr "BryceHarrington" "ip:Conversion_script" "BenBaker" "ip:Conversion_script" ...
## $ user_id : chr "3684" "ip:Conversion_script" "256" "ip:Conversion_script" ...
## $ CATEGORY : chr "" "" "" "" ...
## $ IMAGE : chr "" "" "" "" ...
## $ MAIN : chr "" "Bassas_da_India" "1980 Linguist 1893 South_African" "1980 Linguist 1893 Africa South_African" ...
## $ TALK : chr "" "" "" "" ...
## $ USER : chr "" "" "" "" ...
## $ USER_TALK : chr "" "" "" "" ...
## $ OTHER : chr "" "" "" "" ...
## $ EXTERNAL : chr "" "" "" "" ...
## $ TEMPLATE : chr "" "" "" "" ...
## $ COMMENT : chr "I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG" "Automated conversion" "*" "Automated conversion" ...
## $ MINOR : int 0 1 1 1 1 0 0 0 1 1 ...
## $ TEXTDATA : int 118 16 8 15 16 47 53 54 54 54 ...
# Inspect the first few fully processed revision records.
head(revisions_processed)
## article_id rev_id article_title date time
## 1 47 233248 AbalonE 2001-01-28 09:38:56Z
## 2 3527 383723 Military_of_Bassas_da_India 2002-02-25 15:51:15Z
## 3 6330 243308 Clement_Martyn_Doke 2001-09-13 17:30:56Z
## 4 6330 882117 Clement_Martyn_Doke 2002-02-25 15:51:15Z
## 5 6330 4893101 Clement_Martyn_Doke 2003-05-02 10:50:27Z
## 6 6330 4931724 Clement_Martyn_Doke 2004-07-28 22:05:55Z
## [ip:]username user_id CATEGORY IMAGE
## 1 BryceHarrington 3684
## 2 ip:Conversion_script ip:Conversion_script
## 3 BenBaker 256
## 4 ip:Conversion_script ip:Conversion_script
## 5 JohnOwens 4558
## 6 Evertype 58589
## MAIN
## 1
## 2 Bassas_da_India
## 3 1980 Linguist 1893 South_African
## 4 1980 Linguist 1893 Africa South_African
## 5 1980 Linguist 1893 Africa South_African
## 6 1980 1893 Africa South_African Click_consonant Bantu_languages Khoisan_languages Linguist Zulu_language
## TALK USER USER_TALK OTHER EXTERNAL TEMPLATE
## 1
## 2
## 3
## 4
## 5
## 6
## COMMENT
## 1 I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG
## 2 Automated conversion
## 3 *
## 4 Automated conversion
## 5 sentence
## 6
## MINOR TEXTDATA
## 1 0 118
## 2 1 16
## 3 1 8
## 4 1 15
## 5 1 16
## 6 0 47
I want to grab all the categories for each article_id
# Collapse all CATEGORY strings observed across an article's revisions into
# one space-separated string per article_id. dplyr is attached here because
# %>% and the verbs below come from it and it is not loaded anywhere earlier
# in the visible file (harmless no-op if it was attached in a setup chunk).
library(dplyr)
categories <- revisions_processed %>%
  group_by(article_id) %>%
  summarise(CATEGORIES = paste(CATEGORY, collapse = " "))
head(categories, 10)
## # A tibble: 10 x 2
## article_id CATEGORIES
## <int> <chr>
## 1 47 ""
## 2 3527 ""
## 3 6330 " Notable_South_Africans Notable_South_Africans Notable_…
## 4 10864 " Free_software Free_software Free_software Fre…
## 5 20072 " Assemblers Assemblers Assemblers Assemble…
## 6 21494 " …
## 7 26582 " Scottish_pol…
## 8 28056 " "
## 9 30059 " …
## 10 31025 " "
# Deduplicate category tokens per article: split on spaces, trim whitespace,
# drop repeats, and rejoin as a comma-separated string. vapply() (rather
# than sapply()) guarantees a character vector regardless of input shape.
categories$CATEGORIES <- vapply(strsplit(categories$CATEGORIES, split = " "), function(x) {
  paste0(unique(trimws(x)), collapse = ', ')
}, character(1))
library(tidyverse)
## ── Attaching packages ────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Spread the comma-separated category strings into one column per category.
# lengths() counts the split tokens per article directly, replacing the
# max(unlist(lapply(..., length))) construction; the maximum sets the
# number of output columns.
n_category_cols <- max(lengths(strsplit(categories$CATEGORIES, ", ")))
categoriesdf <- as.data.frame(str_split_fixed(categories$CATEGORIES, ", ", n_category_cols))
# Drop column 1: it holds the empty token left by the leading space that
# paste(collapse = " ") produces when an article's first revision had no
# category. NOTE(review): if any article's first revision carried a real
# category, this would drop it — confirm against the raw data.
categoriesdf <- categoriesdf[, -1]
names(categoriesdf) <- paste0("category_", seq_len(ncol(categoriesdf)))
categoriesdf <- cbind(article_id = categories$article_id, categoriesdf)
head(categoriesdf, 10)
## article_id category_1 category_2
## 1 47
## 2 3527
## 3 6330 Notable_South_Africans South_African_people
## 4 10864 Free_software Free_software_licenses
## 5 20072 Assemblers
## 6 21494 Subcultures Sociolinguistics
## 7 26582 Scottish_politicians History_of_Scotland
## 8 28056
## 9 30059 Turkey Greek_mythology
## 10 31025
## category_3 category_4 category_5
## 1
## 2
## 3 Linguists 1980_deaths 1893_births
## 4 Software_licenses Libre
## 5
## 6 Slang Stereotypes Customary_categories_of_people
## 7 Scotland Scottish_monarchs 1329_deaths
## 8
## 9 Archaeology Hittite_Empire Ancient_Greek_cities
## 10
## category_6 category_7 category_8
## 1
## 2
## 3 South_African_linguists
## 4
## 5
## 6 Nerds Slang_expressions Stock_characters
## 7 1274_births House_of_Bruce Defectors
## 8
## 9 Geography_of_Turkey Trojans Archaeological_sites
## 10
## category_9 category_10
## 1
## 2
## 3
## 4
## 5
## 6 People Sfasdfalang_expressions
## 7 Earls_in_the_Peerage_of_Scotland Guardians_of_Scotland
## 8
## 9 Archaeological_sites_in_Turkey Lost_cities_and_towns
## 10
## category_11 category_12 category_13
## 1
## 2
## 3
## 4
## 5
## 6 Anti-intellectualism
## 7 Wars_of_Scottish_Independence Dumfries_and_Galloway Medieval_Gaels
## 8
## 9 Destroyed_cities Greek_sites_in_Turkey Eccentrics
## 10
## category_14 category_15
## 1
## 2
## 3
## 4
## 5
## 6
## 7 Scottish_people Medieval_Scotland
## 8
## 9 Patent_clerks Jewish-American_scientists
## 10
## category_16 category_17 category_18
## 1
## 2
## 3
## 4
## 5
## 6
## 7 Natives_of_Dumfries_and_Galloway High_Kings_of_Ireland Revolutionaries
## 8
## 9 1879_births Humanitarians Humanists
## 10
## category_19 category_20 category_21
## 1
## 2
## 3
## 4
## 5
## 6
## 7 Rebels Scottish_Roman_Catholics House_of_Glover
## 8
## 9 1955_deaths Vegetarians Refugees
## 10
## category_22
## 1
## 2
## 3
## 4
## 5
## 6
## 7 People_excommunicated_by_the_Roman_Catholic_Church
## 8
## 9 Naturalized_citizens_of_the_United_States
## 10
## category_23 category_24
## 1
## 2
## 3
## 4
## 5
## 6
## 7 People_from_Dumfries_and_Galloway Scottish_rebels
## 8
## 9 Socialists Social_justice
## 10
## category_25 category_26
## 1
## 2
## 3
## 4
## 5
## 6
## 7 People_from_Ayrshire People_from_South_Ayrshire
## 8
## 9 Contributors_to_general_relativity Autodidacts
## 10
## category_27 category_28
## 1
## 2
## 3
## 4
## 5
## 6
## 7 Scottish_Gaelic-speaking_people
## 8
## 9 Jewish_scientists Natives_of_Baden-Württemberg
## 10
## category_29 category_30 category_31 category_32
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 Manhattan_Project Cosmologists German_scientists German-Americans
## 10
## category_33 category_34 category_35 category_36
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 World_federalists Erdős_number_2 Physicists Formerly_stateless_people
## 10
## category_37 category_38 category_39
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 Nobel_Prize_in_Physics_winners German_physicists Albert_Einstein
## 10
## category_40 category_41
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 Aegean_civilization Ancient_Greek_sites_in_Turkey
## 10
## category_42 category_43
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 World_Heritage_Sites_in_Turkey Locations_in_Greek_mythology
## 10
## category_44
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 National_parks_of_Turkey
## 10
# At this point categoriesdf has one row per article and up to 44 factor
# category columns (most empty for most articles).
str(categoriesdf)
## 'data.frame': 1015 obs. of 45 variables:
## $ article_id : int 47 3527 6330 10864 20072 21494 26582 28056 30059 31025 ...
## $ category_1 : Factor w/ 632 levels "","_1934_births",..: 1 1 452 301 124 547 516 1 588 1 ...
## $ category_2 : Factor w/ 501 levels "","_1963_deaths",..: 1 1 424 244 1 420 277 1 264 1 ...
## $ category_3 : Factor w/ 372 levels "","_1870_births",..: 1 1 231 321 1 319 309 1 87 1 ...
## $ category_4 : Factor w/ 288 levels "","_1920_Year_of_birth",..: 1 1 37 187 1 263 249 1 168 1 ...
## $ category_5 : Factor w/ 205 levels "","_Cancelled_PC_games",..: 1 1 17 1 1 85 11 1 56 1 ...
## $ category_6 : Factor w/ 167 levels "","_Computer_and_video_games_based_on_licensed_properties",..: 1 1 149 1 1 120 10 1 88 1 ...
## $ category_7 : Factor w/ 129 levels "","_Animated_Television_Series",..: 1 1 1 1 1 113 71 1 121 1 ...
## $ category_8 : Factor w/ 98 levels "","_1897_births",..: 1 1 1 1 1 94 40 1 24 1 ...
## $ category_9 : Factor w/ 77 levels "","_Irish-Americans",..: 1 1 1 1 1 52 22 1 9 1 ...
## $ category_10: Factor w/ 66 levels "","_6teen_Characters",..: 1 1 1 1 1 59 33 1 38 1 ...
## $ category_11: Factor w/ 50 levels "","1956_births",..: 1 1 1 1 1 14 49 1 21 1 ...
## $ category_12: Factor w/ 41 levels "","1920_births",..: 1 1 1 1 1 1 16 1 21 1 ...
## $ category_13: Factor w/ 36 levels "","6teen_characters",..: 1 1 1 1 1 1 19 1 14 1 ...
## $ category_14: Factor w/ 29 levels "","_PlayStation_Portable_games",..: 1 1 1 1 1 1 21 1 18 1 ...
## $ category_15: Factor w/ 24 levels "","1976_deaths",..: 1 1 1 1 1 1 15 1 12 1 ...
## $ category_16: Factor w/ 21 levels "","1879_births",..: 1 1 1 1 1 1 12 1 2 1 ...
## $ category_17: Factor w/ 18 levels "","American_baseball_players",..: 1 1 1 1 1 1 11 1 12 1 ...
## $ category_18: Factor w/ 17 levels "","Albany_Capitals_players",..: 1 1 1 1 1 1 17 1 9 1 ...
## $ category_19: Factor w/ 13 levels "","1955_deaths",..: 1 1 1 1 1 1 11 1 2 1 ...
## $ category_20: Factor w/ 12 levels "","Miami_Vice_cast_members",..: 1 1 1 1 1 1 7 1 12 1 ...
## $ category_21: Factor w/ 9 levels "","24_(TV_series)_cast_members",..: 1 1 1 1 1 1 6 1 9 1 ...
## $ category_22: Factor w/ 8 levels "","African_American_sportspeople",..: 1 1 1 1 1 1 8 1 6 1 ...
## $ category_23: Factor w/ 7 levels "","Christianity_in_Oxford",..: 1 1 1 1 1 1 5 1 6 1 ...
## $ category_24: Factor w/ 5 levels "","Fame_(TV_series)_cast_members",..: 1 1 1 1 1 1 3 1 4 1 ...
## $ category_25: Factor w/ 4 levels "","Contributors_to_general_relativity",..: 1 1 1 1 1 1 4 1 2 1 ...
## $ category_26: Factor w/ 4 levels "","Autodidacts",..: 1 1 1 1 1 1 4 1 2 1 ...
## $ category_27: Factor w/ 3 levels "","Jewish_scientists",..: 1 1 1 1 1 1 3 1 2 1 ...
## $ category_28: Factor w/ 2 levels "","Natives_of_Baden-Württemberg": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_29: Factor w/ 2 levels "","Manhattan_Project": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_30: Factor w/ 2 levels "","Cosmologists": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_31: Factor w/ 2 levels "","German_scientists": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_32: Factor w/ 2 levels "","German-Americans": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_33: Factor w/ 2 levels "","World_federalists": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_34: Factor w/ 2 levels "","Erdős_number_2": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_35: Factor w/ 2 levels "","Physicists": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_36: Factor w/ 2 levels "","Formerly_stateless_people": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_37: Factor w/ 2 levels "","Nobel_Prize_in_Physics_winners": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_38: Factor w/ 2 levels "","German_physicists": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_39: Factor w/ 2 levels "","Albert_Einstein": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_40: Factor w/ 2 levels "","Aegean_civilization": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_41: Factor w/ 2 levels "","Ancient_Greek_sites_in_Turkey": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_42: Factor w/ 2 levels "","World_Heritage_Sites_in_Turkey": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_43: Factor w/ 2 levels "","Locations_in_Greek_mythology": 1 1 1 1 1 1 1 1 2 1 ...
## $ category_44: Factor w/ 2 levels "","National_parks_of_Turkey": 1 1 1 1 1 1 1 1 2 1 ...
Now to tidy it up: convert from wide to long format, replace empty strings with NA, and filter them out.
library(dplyr)
# Reshape the per-article category columns to long format, drop empty
# entries, and count occurrences of each category. Filtering on "" directly
# replaces the original's roundabout replace(. == "", NA) followed by
# filter(!is.na(...)). (gather() is superseded; pivot_longer() is the
# modern equivalent if tidyr is ever upgraded.)
df3 <- categoriesdf %>%
  gather(category, CATEGORIES, -article_id) %>%
  filter(CATEGORIES != "") %>%
  select(-category) %>%
  group_by(CATEGORIES) %>%
  summarise(number = n()) %>%
  arrange(desc(number))
head(df3, 10)
## # A tibble: 10 x 2
## CATEGORIES number
## <chr> <int>
## 1 Living_people 84
## 2 Year_of_birth_missing 12
## 3 Year_of_birth_missing_(living_people) 10
## 4 Articles_for_deletion 6
## 5 2004_albums 5
## 6 Debut_albums 5
## 7 1958_births 4
## 8 1977_births 4
## 9 2007_albums 4
## 10 American_film_actors 4
# df3: one row per distinct category with its article count, sorted
# descending by count.
str(df3)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2662 obs. of 2 variables:
## $ CATEGORIES: chr "Living_people" "Year_of_birth_missing" "Year_of_birth_missing_(living_people)" "Articles_for_deletion" ...
## $ number : int 84 12 10 6 5 5 4 4 4 4 ...
Separate the date, create monthly counts
# Split the "YYYY-MM-DD" date into year/month/day components, then count
# revisions per article per (year, month).
date_split <- separate(data = revisions_processed, col = date, into = c('year', 'month', 'day'), sep = "-")
monthlycounts <- date_split %>%
group_by(article_id, year, month) %>%
summarise(count = n())
# NOTE(review): this arrange() result is only printed (knitr echoes it),
# not assigned — monthlycounts itself keeps its original row order.
arrange(monthlycounts, article_id, year, month)
# Work on a copy so revisions_processed keeps its character date column.
revisions_to_dates <- revisions_processed
# Parse the "YYYY-MM-DD" strings into Date objects.
revisions_to_dates$date <- as.Date(revisions_to_dates$date, '%Y-%m-%d')
str(revisions_to_dates)
## 'data.frame': 42363 obs. of 19 variables:
## $ article_id : int 47 3527 6330 6330 6330 6330 6330 6330 6330 6330 ...
## $ rev_id : int 233248 383723 243308 882117 4893101 4931724 4931731 5518069 5541683 5629426 ...
## $ article_title: chr "AbalonE" "Military_of_Bassas_da_India" "Clement_Martyn_Doke" "Clement_Martyn_Doke" ...
## $ date : Date, format: "2001-01-28" "2002-02-25" ...
## $ time : chr "09:38:56Z" "15:51:15Z" "17:30:56Z" "15:51:15Z" ...
## $ [ip:]username: chr "BryceHarrington" "ip:Conversion_script" "BenBaker" "ip:Conversion_script" ...
## $ user_id : chr "3684" "ip:Conversion_script" "256" "ip:Conversion_script" ...
## $ CATEGORY : chr "" "" "" "" ...
## $ IMAGE : chr "" "" "" "" ...
## $ MAIN : chr "" "Bassas_da_India" "1980 Linguist 1893 South_African" "1980 Linguist 1893 Africa South_African" ...
## $ TALK : chr "" "" "" "" ...
## $ USER : chr "" "" "" "" ...
## $ USER_TALK : chr "" "" "" "" ...
## $ OTHER : chr "" "" "" "" ...
## $ EXTERNAL : chr "" "" "" "" ...
## $ TEMPLATE : chr "" "" "" "" ...
## $ COMMENT : chr "I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG" "Automated conversion" "*" "Automated conversion" ...
## $ MINOR : int 0 1 1 1 1 0 0 0 1 1 ...
## $ TEXTDATA : int 118 16 8 15 16 47 53 54 54 54 ...
# Truncate each date to its year-month ("YYYY-MM") so revisions can be
# bucketed by calendar month. Note format() turns the column back into
# character.
revisions_to_dates$date <- format(revisions_to_dates$date, format="%Y-%m")
head(revisions_to_dates)
## article_id rev_id article_title date time
## 1 47 233248 AbalonE 2001-01 09:38:56Z
## 2 3527 383723 Military_of_Bassas_da_India 2002-02 15:51:15Z
## 3 6330 243308 Clement_Martyn_Doke 2001-09 17:30:56Z
## 4 6330 882117 Clement_Martyn_Doke 2002-02 15:51:15Z
## 5 6330 4893101 Clement_Martyn_Doke 2003-05 10:50:27Z
## 6 6330 4931724 Clement_Martyn_Doke 2004-07 22:05:55Z
## [ip:]username user_id CATEGORY IMAGE
## 1 BryceHarrington 3684
## 2 ip:Conversion_script ip:Conversion_script
## 3 BenBaker 256
## 4 ip:Conversion_script ip:Conversion_script
## 5 JohnOwens 4558
## 6 Evertype 58589
## MAIN
## 1
## 2 Bassas_da_India
## 3 1980 Linguist 1893 South_African
## 4 1980 Linguist 1893 Africa South_African
## 5 1980 Linguist 1893 Africa South_African
## 6 1980 1893 Africa South_African Click_consonant Bantu_languages Khoisan_languages Linguist Zulu_language
## TALK USER USER_TALK OTHER EXTERNAL TEMPLATE
## 1
## 2
## 3
## 4
## 5
## 6
## COMMENT
## 1 I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG
## 2 Automated conversion
## 3 *
## 4 Automated conversion
## 5 sentence
## 6
## MINOR TEXTDATA
## 1 0 118
## 2 1 16
## 3 1 8
## 4 1 15
## 5 1 16
## 6 0 47
# Monthly revision counts: one row per (article_id, year-month). tally()
# after group_by() is equivalent to summarise(count = n()).
collapsed <- revisions_to_dates %>%
  group_by(article_id, date) %>%
  tally(name = "count")
# Show the busiest article-months first (printed only, not assigned).
arrange(collapsed, desc(count))
## # A tibble: 8,856 x 3
## # Groups: article_id [1,015]
## article_id date count
## <int> <chr> <int>
## 1 8312072 2006-12 297
## 2 21494 2006-11 249
## 3 21494 2007-01 241
## 4 21494 2007-05 232
## 5 21494 2007-04 206
## 6 275510 2007-03 200
## 7 21494 2006-10 199
## 8 21494 2007-02 197
## 9 21494 2007-09 197
## 10 8918937 2007-07 185
## # ... with 8,846 more rows
library(zoo)
# USE THIS
# Monthly revision counts per article, coloured by article_id (lower ids
# roughly correspond to older articles). scale_x_yearmon() (from zoo) gives
# ggplot a proper axis for yearmon values instead of the echoed
# "Don't know how to automatically pick scale" fallback; scales::comma is
# namespace-qualified because the scales package is not attached until later
# in the file.
ggplot(data = collapsed,
       aes(x = as.yearmon(date),
           y = count,
           colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_point(size = 1.3) +
  scale_x_yearmon() +
  scale_color_gradient2(midpoint = 7000000, labels = scales::comma) +
  labs(title = "Revisions by Month for 1000 Sampled Article IDs (2001-2008)", colour = "Article ID") +
  xlab("Year-Month") +
  ylab("Number of Revisions")
# +
# theme(plot.title = element_text(size = rel(1.3)), axis.ticks.length = unit(.25, "cm"))
ggsave('img/article-revisions-by-month-sample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
It could be interesting to look at which articles have such high amounts of revisions.
For now, let’s normalize by total number of revisions to get percentages.
# Within each article, convert the monthly counts into that article's share
# of its lifetime total revisions. prop.table(x) is x / sum(x), evaluated
# per group by mutate().
percents <- collapsed %>%
  group_by(article_id) %>%
  mutate(percent = prop.table(count))
library(zoo)
# USE THIS
# Same plot as above but normalized: each article's monthly revisions as a
# share of its lifetime total. The original added geom_point() twice (once
# unsized, once with size = 1.3), drawing every point in two layers; the
# redundant layer is removed. scales::comma is namespace-qualified because
# scales is not attached until later in the file.
ggplot(data = percents,
       aes(x = as.yearmon(date),
           y = percent,
           colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_point(size = 1.3) +
  scale_color_gradient2(midpoint = 7000000, labels = scales::comma) +
  labs(title = "Revisions Per Month Normalized by Total Revisions for 1000 Sampled Article IDs (2001-2008)", colour = "Article ID") +
  xlab("Year-Month") +
  ylab("Percent of Article Revisions")
ggsave('img/article-revisions-normalized-by-month-sample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
Plot this with the first revision to get a plot that proves our point about article ids and article age. Articles that were created later in the window might not have had time to show the true trends of an article’s lifecycle. Let’s only look at articles whose first revision was before 2003 (the cutoff actually applied in the code below).
# Date of first revision for each article id.
# USE THIS - prepare background slide for why we cut off all 2003
# Sort explicitly before deduplicating so the kept row per article really is
# its earliest month, regardless of the incoming row order. The original
# used percents[!duplicated(percents$article_id), ] with the comment "this
# won't work if the data isn't sorted by date" — this version removes that
# dependency. ("YYYY-MM" strings sort chronologically.)
t.first <- percents %>%
  arrange(article_id, date) %>%
  distinct(article_id, .keep_all = TRUE)
arrange(t.first, desc(date))
# Keep only articles whose first revision predates 2003-01, so every kept
# article has several years of history inside the 2001-2008 window.
t.first.before2003 <- subset(t.first, as.yearmon(date) < as.yearmon("2003-01"))
arrange(t.first.before2003, desc(date))
So now we know which article_ids were created before 2003. We need to subset the data for these article_ids.
# Restrict the normalized series to the articles first revised before 2003.
filtered_revisions <- subset(percents, article_id %in% t.first.before2003$article_id)
arrange(filtered_revisions, article_id)
# Normalized monthly shares for the retained articles, with the 2003-01
# cutoff marked in red.
ggplot(data = filtered_revisions, aes(x = as.yearmon(date), y = percent, colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_vline(xintercept = as.yearmon("2003-01"), colour = "red") +
  geom_point()
ggsave('img-other/articles-normalized-by-total-subsample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
# Same plot, zoomed to percent <= 0.5.
# NOTE(review): ylim(0, .5) drops the 5 rows above 0.5 entirely (hence the
# "Removed 5 rows" warning); coord_cartesian(ylim = c(0, .5)) would zoom
# without discarding data — confirm which is intended.
ggplot(data = filtered_revisions, aes(x=as.yearmon(date), y = percent, colour = article_id)) + geom_line(aes(group = article_id)) + geom_vline(xintercept = as.yearmon("2003-01"), colour = "red") + ylim(0,.5) + geom_point()
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Warning: Removed 5 rows containing missing values (geom_point).
ggsave('img-other/articles-normalized-by-total-truncated-subsample1.png',
plot = last_plot(),
width = 10,
height = 6)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Warning: Removed 5 rows containing missing values (geom_point).
Convert records to time since first revision
# Align every article on its own clock: integer months elapsed since its
# first revision. The lookup via match() pulls each article's first date
# from t.first.
aligned_revisions <- filtered_revisions
aligned_revisions$first <- t.first$date[match(aligned_revisions$article_id, t.first$article_id)]
elapsed_months <- (as.yearmon(aligned_revisions$date) - as.yearmon(aligned_revisions$first)) * 12
aligned_revisions$time.since.creation <- as.integer(round(elapsed_months))
library(scales)
# Normalized shares plotted against years since first revision.
ggplot(data = aligned_revisions, aes(x = time.since.creation / 12, y = percent, colour = article_id)) +
  geom_line(aes(group = article_id)) +
  scale_y_continuous(labels = scales::percent) +
  geom_point()
ggsave('img-other/articles-time-since-first-creation-subsample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
Okay so there might be some trends that are obfuscated because we do not have 0% for months where there are no edits.
library(data.table)
# Pivot to wide format: one row per article, one column per month offset,
# with the per-month revision share as values. as.data.table() takes a copy;
# the original used setDT(), which silently converts aligned_revisions to a
# data.table *in place* — a surprising side effect on a variable used
# elsewhere in the script.
all_months <- as.data.frame(dcast(as.data.table(aligned_revisions), article_id ~ time.since.creation, value.var = 'percent'))
# Months with no edits are absent from the long data; treat them as 0%.
all_months[is.na(all_months)] <- 0
head(all_months)
## article_id 0 1 2 3 4 5 6 7 8 9
## 1 47 1.0000000000 0 0.000000000 0 0 0.0000000000 0 0 0.000000000 0
## 2 3527 1.0000000000 0 0.000000000 0 0 0.0000000000 0 0 0.000000000 0
## 3 6330 0.0285714286 0 0.000000000 0 0 0.0285714286 0 0 0.000000000 0
## 4 10864 0.0035971223 0 0.003597122 0 0 0.0000000000 0 0 0.003597122 0
## 5 20072 0.0434782609 0 0.000000000 0 0 0.0000000000 0 0 0.000000000 0
## 6 21494 0.0002515723 0 0.000000000 0 0 0.0002515723 0 0 0.000000000 0
## 10 11 12 13 14 15 16 17
## 1 0 0 0.0000000000 0.0000000000 0.000000000 0 0 0.0000000000
## 2 0 0 0.0000000000 0.0000000000 0.000000000 0 0 0.0000000000
## 3 0 0 0.0000000000 0.0000000000 0.000000000 0 0 0.0000000000
## 4 0 0 0.0035971223 0.0000000000 0.000000000 0 0 0.0000000000
## 5 0 0 0.0000000000 0.0000000000 0.000000000 0 0 0.0434782609
## 6 0 0 0.0002515723 0.0002515723 0.000754717 0 0 0.0002515723
## 18 19 20 21 22 23
## 1 0.0000000000 0.000000000 0.00000000 0 0.0000000000 0.0000000000
## 2 0.0000000000 0.000000000 0.00000000 0 0.0000000000 0.0000000000
## 3 0.0000000000 0.000000000 0.02857143 0 0.0000000000 0.0000000000
## 4 0.0000000000 0.000000000 0.00000000 0 0.0000000000 0.0035971223
## 5 0.0000000000 0.000000000 0.00000000 0 0.0000000000 0.0000000000
## 6 0.0005031447 0.000754717 0.00000000 0 0.0002515723 0.0005031447
## 24 25 26 27 28 29 30 31
## 1 0.000000000 0 0 0 0.000000000 0.0000000000 0.000000000 0.000000000
## 2 0.000000000 0 0 0 0.000000000 0.0000000000 0.000000000 0.000000000
## 3 0.000000000 0 0 0 0.000000000 0.0000000000 0.000000000 0.000000000
## 4 0.007194245 0 0 0 0.000000000 0.0000000000 0.000000000 0.000000000
## 5 0.000000000 0 0 0 0.000000000 0.0000000000 0.000000000 0.043478261
## 6 0.000000000 0 0 0 0.001257862 0.0002515723 0.002012579 0.001509434
## 32 33 34 35 36 37
## 1 0.000000000 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 2 0.000000000 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 3 0.000000000 0.000000000 0.085714286 0.05714286 0.057142857 0.000000000
## 4 0.007194245 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 5 0.000000000 0.000000000 0.043478261 0.00000000 0.000000000 0.043478261
## 6 0.001761006 0.008805031 0.004528302 0.00327044 0.005534591 0.005031447
## 38 39 40 41 42 43
## 1 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 2 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 3 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 4 0.000000000 0.000000000 0.000000000 0.003597122 0.010791367 0.010791367
## 5 0.000000000 0.043478261 0.043478261 0.000000000 0.000000000 0.043478261
## 6 0.008301887 0.005534591 0.003773585 0.002515723 0.007044025 0.007295597
## 44 45 46 47 48 49
## 1 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.00000000 0.00000000 0.14285714 0.05714286 0.02857143 0.05714286
## 4 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 5 0.00000000 0.00000000 0.00000000 0.08695652 0.00000000 0.00000000
## 6 0.00754717 0.01056604 0.01031447 0.01509434 0.01283019 0.01056604
## 50 51 52 53 54 55
## 1 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.028571429 0.00000000 0.00000000 0.00000000 0.00000000 0.02857143
## 4 0.003597122 0.00000000 0.04676259 0.00000000 0.01438849 0.03956835
## 5 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.04347826
## 6 0.011069182 0.02389937 0.02062893 0.01786164 0.02314465 0.01786164
## 56 57 58 59 60 61
## 1 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.02857143 0.00000000 0.02857143 0.00000000 0.02857143 0.02857143
## 4 0.05755396 0.03597122 0.01079137 0.01079137 0.03237410 0.01438849
## 5 0.00000000 0.08695652 0.00000000 0.00000000 0.08695652 0.00000000
## 6 0.02616352 0.03018868 0.01635220 0.01911950 0.02314465 0.05006289
## 62 63 64 65 66 67
## 1 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.000000000 0.02857143 0.02857143 0.00000000 0.00000000 0.00000000
## 4 0.007194245 0.01079137 0.03597122 0.00000000 0.06115108 0.05755396
## 5 0.130434783 0.13043478 0.04347826 0.00000000 0.00000000 0.00000000
## 6 0.062641509 0.02490566 0.06062893 0.04955975 0.04201258 0.05182390
## 68 69 70 71 72 73
## 1 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.00000000 0.02857143 0.08571429 0.00000000 0.05714286 0.00000000
## 4 0.05755396 0.01798561 0.07913669 0.10431655 0.03597122 0.04316547
## 5 0.00000000 0.00000000 0.04347826 0.00000000 0.00000000 0.00000000
## 6 0.05836478 0.04477987 0.02792453 0.02540881 0.04955975 0.02817610
## 74 75 76 77 78 79 80
## 1 0.00000000 0.000000000 0.000000000 0.00000000 0.00000000 0 0
## 2 0.00000000 0.000000000 0.000000000 0.00000000 0.00000000 0 0
## 3 0.02857143 0.000000000 0.000000000 0.00000000 0.00000000 0 0
## 4 0.03956835 0.032374101 0.014388489 0.02517986 0.05395683 0 0
## 5 0.00000000 0.000000000 0.000000000 0.00000000 0.00000000 0 0
## 6 0.04477987 0.005786164 0.004779874 0.00000000 0.00000000 0 0
library(reshape2)
library(ggvis)  # NOTE(review): attached but not used below — kept as-is
# Back to long format for plotting: one row per (article_id, month offset).
df1 <- melt(all_months, "article_id")
# melt() yields `variable` as a factor of the column names ("0", "1", ...).
# as.integer() on a factor returns the 1-based level codes, which would
# shift every month offset by one; converting via as.character() first
# recovers the real numeric offsets.
df1$variable <- as.integer(as.character(df1$variable))
ggplot(data = df1, aes(x = variable, y = value, colour = article_id)) +
  geom_line(aes(group = article_id)) +
  scale_y_continuous(labels = scales::percent) +
  geom_point()
ggsave('img-other/articles-normalized-by-total-melted-subsample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
Okay so we could use a plot to show why we are throwing out articles that’s first revision was after some date. And we need to recalculate percents to be windows of 5 years. So percent of the revisions that occur in the first five years or two years?
# Raw (un-normalized) monthly counts for all sampled articles — background
# slide for why articles first revised late in the window are dropped.
# NOTE(review): midpoint = 7000000 appears chosen to split the article_id
# range for the diverging colour scale — confirm.
ggplot(data = collapsed, aes(x=as.yearmon(date), y = count, colour = article_id)) + geom_line(aes(group = article_id)) + geom_point(size = 1.3) + scale_color_gradient2(midpoint = 7000000)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
ggsave('img-other/why-throwout-articles-after-first-revision-subsample1.png',
plot = last_plot(),
width = 10,
height = 6)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
Okay, so we need to remove articles whose first revision occurred less than 5 years before the last date in the data (2008-01). Do this on the original counts, not the percentages.
# Count-based (not normalized) series for articles first revised before
# 2003, mirroring the filtering applied to the percentage series.
filtered_revisions_counts <- subset(collapsed, article_id %in% t.first.before2003$article_id)
arrange(filtered_revisions_counts, article_id)
ggplot(data = filtered_revisions_counts, aes(x = as.yearmon(date), y = count, colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_vline(xintercept = as.yearmon("2003-01"), colour = "red") +
  scale_color_gradient2(midpoint = 7000000)
ggsave('img-other/filtered-revision-counts-subsample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
Okay. Now we need to convert date to months since creation (so we can cap off the first 5 years). Convert records to time since first revision.
# Align the count series on months since each article's first revision,
# mirroring the normalized version earlier. as.integer(round(...)) matches
# the type used for aligned_revisions$time.since.creation — the original
# applied round() alone here, leaving a double and making the two aligned
# frames inconsistent.
aligned_revisions_counts <- filtered_revisions_counts
aligned_revisions_counts$first <- t.first$date[match(aligned_revisions_counts$article_id, t.first$article_id)]
aligned_revisions_counts$time.since.creation <- as.integer(round((as.yearmon(aligned_revisions_counts$date) - as.yearmon(aligned_revisions_counts$first)) * 12))
library(scales)
ggplot(data = aligned_revisions_counts, aes(x = time.since.creation / 12, y = count, colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_point() +
  xlab("Years since First Revision")
ggsave('img-other/articles-months-since-first-revision-subsample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
It is really interesting that smaller article_ids (older articles?) were heavily edited around 4-6 years after first creation. Okay, now we need to convert to wide format.
library(data.table)
# Wide format: one row per article, one column per month-since-creation;
# cells hold the revision count. Months with no revisions come back as NA
# from dcast, so zero them out.
wide <- dcast(setDT(aligned_revisions_counts),
              article_id ~ time.since.creation,
              value.var = 'count')
all_months_counts <- as.data.frame(wide)
all_months_counts[is.na(all_months_counts)] <- 0
all_months_counts
## article_id 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
## 1 47 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 3527 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 6330 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 4 10864 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 5 20072 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 6 21494 1 0 0 0 0 1 0 0 0 0 0 0 1 1 3 0 0 1 2 3 0 0 1
## 7 26582 3 1 0 1 0 0 0 1 1 1 1 2 0 0 1 0 0 0 1 3 1 1 3
## 8 28056 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9 30059 1 0 0 0 0 0 2 0 0 0 0 3 3 4 1 0 3 1 0 0 1 2 0
## 10 31025 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 34422 1 0 0 0 0 0 0 0 0 3 0 0 0 0 1 0 0 1 5 3 1 0 1
## 12 34615 2 0 1 0 0 0 2 1 0 0 5 0 0 0 4 3 1 4 0 3 2 0 1
## 13 40908 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14 54107 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 15 57122 4 0 3 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 2 4 0
## 16 78055 1 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 4 0 0
## 17 78932 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1
## 18 84081 1 0 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2
## 19 89237 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 6
## 20 98197 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 21 101352 2 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 1 0 0 0 0 1 2
## 22 110683 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 23 110813 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 24 121543 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 25 123119 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
## 26 125806 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 27 127226 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 28 127246 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
## 29 127922 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
## 30 132879 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 31 133993 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 32 135427 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
## 33 136291 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2
## 34 137704 2 0 0 0 2 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 2 0 0
## 35 139330 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2
## 36 151701 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 5 0 0 2 2 0 1
## 37 152509 12 0 0 0 0 0 0 0 5 0 0 4 0 0 0 0 0 1 1 1 0 2 12
## 38 155081 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 2 3 0 0 0 4 1
## 39 158625 2 0 0 0 0 0 0 1 0 0 0 1 0 0 2 0 0 0 1 0 0 0 1
## 40 161973 1 0 0 1 2 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 3
## 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 3 2 2 0 0 0 0 0 0 0 0 0 5
## 4 1 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 3 3 0 0 0
## 5 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 0 0
## 6 2 0 0 0 0 5 1 8 6 7 35 18 13 22 20 33 22 15 10 28 29 30 42 41
## 7 10 3 0 5 0 2 4 3 2 11 11 2 0 6 7 10 10 11 10 11 8 7 13 26
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9 0 3 1 11 1 2 0 0 8 2 20 4 5 12 2 33 20 23 2 7 10 6 27 16
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 2 1 1 0 2 1 0 2 1 0 7 6 3 2 3 2 3 1 1 3 4 4
## 12 1 1 2 2 6 2 3 5 3 4 6 5 2 15 11 13 13 17 38 8 7 11 3 34
## 13 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 15 1 5 2 0 4 1 0 1 1 0 4 2 1 0 0 4 1 2 0 9 31 8 3 2
## 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 17 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2
## 18 0 0 1 0 0 1 0 1 3 1 0 1 2 1 1 0 3 0 3 0 0 1 0 0
## 19 0 3 0 1 0 0 0 0 3 2 3 0 0 0 1 2 0 5 2 0 0 0 0 0
## 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 21 0 1 1 2 2 2 0 2 0 0 2 0 1 1 0 1 0 1 11 3 0 1 1 5
## 22 0 1 4 1 0 0 0 0 0 0 0 0 1 0 0 1 0 2 1 1 1 0 0 0
## 23 0 0 2 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0
## 24 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## 25 0 0 3 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0
## 26 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 3 0 0 1 0 1 0 1 0
## 27 0 1 1 2 0 0 3 0 0 0 0 0 1 0 1 0 0 0 1 16 1 0 1 0
## 28 0 0 3 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 3 1 1 4 0 1
## 29 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 0 0
## 30 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0
## 31 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0
## 32 0 1 3 1 0 30 14 6 9 9 30 0 1 1 14 0 6 16 10 4 15 2 3 34
## 33 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 9 4 1 1 3
## 34 1 0 0 1 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 35 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1
## 36 0 0 1 0 0 0 2 0 0 1 1 0 0 1 0 2 2 3 0 1 4 1 0 0
## 37 2 0 2 5 0 1 5 0 2 1 4 2 3 2 6 18 0 2 2 7 5 3 0 2
## 38 0 2 0 1 0 0 1 0 1 1 0 0 0 0 0 2 0 5 0 0 3 1 1 1
## 39 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 3 1 0 0 0 0
## 40 1 3 0 0 1 3 1 0 0 2 4 1 3 4 5 1 2 4 3 1 7 1 1 2
## 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 2 1 2 1 0 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0
## 4 0 0 0 1 0 13 0 4 11 16 10 3 3 9 4 2 3 10 0 17
## 5 2 0 0 0 0 0 0 0 1 0 2 0 0 2 0 3 3 1 0 0
## 6 60 51 42 44 95 82 71 92 71 104 120 65 76 92 199 249 99 241 197 167
## 7 30 29 18 17 14 18 27 28 23 14 3 40 32 14 32 41 28 56 22 18
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9 28 17 8 47 33 38 38 51 33 58 62 36 34 45 29 59 111 59 71 86
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 1 1 8 10 4 14 4 13 16 22 8 12 6 8 7 2 7 12 38 45
## 12 89 35 54 58 110 36 129 21 27 17 38 64 119 93 95 92 97 103 87 17
## 13 0 0 0 0 0 2 0 0 0 0 2 0 2 0 1 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 15 10 10 5 6 13 21 13 0 6 2 9 1 5 1 5 2 12 14 17 8
## 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 2 0 2 1 0 0 1 1 0 0 0 0 0
## 18 1 0 0 1 0 0 2 4 1 1 2 1 1 1 1 2 0 2 0 0
## 19 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 21 2 0 0 0 0 1 1 0 0 3 2 13 2 1 3 4 1 0 0 0
## 22 0 1 0 0 0 0 0 0 2 3 0 0 2 5 0 1 0 0 0 0
## 23 0 0 0 1 0 0 1 0 0 1 0 1 2 2 2 1 0 0 0 0
## 24 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
## 25 0 0 0 0 0 0 0 0 1 1 0 0 2 3 0 1 0 0 0 0
## 26 0 3 1 0 0 2 1 0 0 0 0 1 0 2 3 1 0 0 0 0
## 27 0 0 5 0 0 0 0 0 0 1 0 1 1 2 1 2 0 0 0 0
## 28 0 0 3 0 0 0 0 0 0 6 8 3 3 2 3 5 1 0 0 0
## 29 0 0 0 0 0 2 0 0 1 0 1 0 2 2 3 1 0 0 0 0
## 30 0 6 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 32 2 6 7 0 5 9 22 10 9 4 8 17 5 9 15 4 1 0 0 0
## 33 0 2 1 1 0 1 3 0 4 2 3 1 3 1 0 2 0 0 0 0
## 34 0 2 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0
## 35 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0
## 36 2 2 3 4 0 1 2 1 0 1 11 8 0 0 3 0 0 0 0 0
## 37 3 1 3 1 6 8 10 9 1 3 9 14 8 5 5 0 0 0 0 0
## 38 1 0 2 0 2 3 1 1 0 0 0 0 0 0 0 0 0 0 0 0
## 39 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
## 40 2 2 5 10 13 11 18 3 11 6 17 11 3 0 7 0 0 0 0 0
## 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 1 3 0 2 0 1 0 0 0 0 0 0
## 4 16 16 5 22 29 10 12 11 9 4 7 15 0 0
## 5 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 6 206 232 178 111 101 197 112 178 23 19 0 0 0 0
## 7 8 19 19 29 19 1 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9 110 100 87 59 11 25 47 86 35 33 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 19 25 47 28 13 27 5 2 18 14 48 33 22 1
## 12 5 19 36 89 80 35 4 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 21 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 22 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 24 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 25 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 26 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 27 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 32 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 33 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 34 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 36 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 37 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 38 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 40 0 0 0 0 0 0 0 0 0 0 0 0 0 0
library(reshape2)
library(ggvis)
# Back to long format for plotting (one row per article x month).
df1_counts <- melt(all_months_counts, "article_id")
# BUG FIX: melt() leaves `variable` as a factor whose levels are the month
# labels "0", "1", ...; as.integer() on a factor returns the 1-based level
# index, shifting every month on the x-axis by one. Convert through the
# label text so month "0" stays 0.
df1_counts$variable <- as.integer(as.character(df1_counts$variable))
p_melted <- ggplot(df1_counts, aes(x = variable, y = value, colour = article_id)) +
  geom_line(aes(group = article_id))
print(p_melted)
ggsave('img-other/articles-months-since-first-revision-melted-subsample1.png',
       plot = p_melted,
       width = 10,
       height = 6)
Okay. Since the cut-off date was 2003-01, the maximum observation window for revisions is 5 years. Thus we should cut off all the data after 60 months since first revision, then convert the values to percentages of each article's total revisions.
# Keep article_id plus months 0..60 (first 5 years): column 1 is article_id,
# columns 2..62 are months 0 through 60.
# FIX: the original wrote 0:62 — a 0 index is silently dropped in R, so it
# happened to equal 1:62; spell out the intended range explicitly.
data_5yrs <- all_months_counts[, 1:62]
data_5yrs_norm <- data_5yrs
# Per-article total over the 5-year window, used as the normaliser.
data_5yrs_norm$totalrevisions <- rowSums(data_5yrs[, 2:62])
# Convert each month's count into a share of the article's total revisions.
data_5yrs_norm[, 2:62] <- data_5yrs[, 2:62] / data_5yrs_norm$totalrevisions
df1_5yrs <- melt(data_5yrs_norm, c("article_id", "totalrevisions"))
# FIX: as.integer() on the melted factor gives the 1-based level index, not
# the month number; convert via the label text so month 0 stays 0.
df1_5yrs$variable <- as.integer(as.character(df1_5yrs$variable))
p_norm <- ggplot(df1_5yrs, aes(x = variable, y = value)) +
  geom_line(aes(group = article_id, color = totalrevisions), size = 0.4) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent of Article Revisions") +
  xlab("Months since First Revision")
print(p_norm)
ggsave('img-other/articles-normalized-by-total-5years-subsample1.png',
       plot = p_norm,
       width = 10,
       height = 6)
I want to discretize the number of total revisions
# Discretize total revisions into bins. cut() uses right-closed intervals:
# (0,2], (2,10], (10,25], (25,100], (100,500], (500,2000].
data_5yrs_norm$bins <- cut(data_5yrs_norm$totalrevisions,
                           breaks = c(0, 2, 10, 25, 100, 500, 2000),
                           labels = c("1-2", "3-10", "10-25", "25-100", "100-500", "500-"))
df1_5yrs <- melt(data_5yrs_norm, c("article_id", "totalrevisions", "bins"))
# FIX: convert the month factor via its labels, not the 1-based level index.
df1_5yrs$variable <- as.integer(as.character(df1_5yrs$variable))
# FIX: a constant alpha belongs outside aes(); inside aes() the constant is
# mapped as data and produces a spurious "0.5" legend entry.
p_facet <- ggplot(df1_5yrs, aes(x = variable, y = value)) +
  geom_line(aes(group = article_id, color = bins), alpha = 0.5, size = 0.5) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent of Article Revisions") +
  xlab("Months since First Revision") +
  scale_color_hue(direction = -1) +
  facet_grid(~bins) +
  theme(legend.position = "none")
print(p_facet)
ggsave('img-other/articles-normalized-by-total-5years-facet-subsample1.png',
       plot = p_facet,
       width = 10,
       height = 6)
# Same data without faceting; the legend now shows the revision-count bins.
# FIX: the constant alpha is moved outside aes() — inside aes() it is treated
# as a data mapping and adds a meaningless alpha legend.
p_overlay <- ggplot(df1_5yrs, aes(x = variable, y = value)) +
  geom_line(aes(group = article_id, color = bins), alpha = 0.4, size = 0.5) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent of Article Revisions") +
  xlab("Months since First Revision") +
  scale_color_hue(direction = -1) +
  labs(color = "Total Article Revisions")
print(p_overlay)
ggsave('img-other/articles-normalized-by-total-5years-faceted-subsample1.png',
       plot = p_overlay,
       width = 10,
       height = 6)
# One panel per revision-count bin, each saved to its own file, coloured to
# match the palette used by the combined plot above.
factorlevels <- levels(df1_5yrs$bins)
colorsused <- scales::hue_pal(direction = -1)(length(factorlevels))
# IDIOM: iterate with seq_along() instead of a manually maintained counter,
# and pass the plot object to ggsave() explicitly rather than via last_plot().
for (i in seq_along(factorlevels)) {
  f <- factorlevels[i]
  p_bin <- ggplot(df1_5yrs[df1_5yrs$bins == f, ], aes(x = variable, y = value)) +
    geom_line(aes(group = article_id), color = colorsused[i], size = 0.6, alpha = 0.5) +
    scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
    ylab("Percent of Article Revisions") +
    xlab("Months since First Revision") +
    facet_grid(~bins) +
    theme(legend.position = "none")
  print(p_bin)
  ggsave(paste0("img-other/articles-normalized-by-total-5years-facet-", f, "-subsample1.png"),
         plot = p_bin,
         width = 10,
         height = 6)
}
What’s happening with articles that have over 500 revisions?
# Re-plot each revision-count bin to separate files.
# NOTE(review): despite the "over 500 revisions" question above, this loop is
# byte-identical to the previous one and iterates over ALL bins, not just the
# "500-" bin; only the output filename differs ("...-over500revis-...").
# Presumably it was meant to restrict to the "500-" level — confirm intent.
factorlevels <- levels(df1_5yrs$bins)
colorsused <- scales::hue_pal(direction = -1)(length(factorlevels))
counter <- 1
for (f in factorlevels){
print(ggplot(data = df1_5yrs[df1_5yrs$bins == f,], aes(x=variable, y = value)) + geom_line(aes(group = article_id), color=colorsused[counter], size = 0.6, alpha = 0.5) + scale_y_continuous(labels = scales::percent, limits = c(0,1)) + ylab("Percent of Article Revisions") + xlab("Months since First Revision") + facet_grid(~bins) + theme(legend.position="none") )
counter <- counter +1
ggsave(paste0("img-other/articles-normalized-by-total-5years-facet-over500revis-", f, "-subsample1.png"),
plot = last_plot(),
width = 10,
height = 6)
}